/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.io.*; import java.util.*; import java.util.logging.*; import net.nutch.util.*; import net.nutch.quality.dynamic.*; /*************************************************** * The QualityTestTool runs a bunch of tests * against both Nutch and external search engines. * The inputs for the actual quality metric computation * can be precomputed using ResultSetGenerator and * URLInsetGenerator. OR, QualityTestTool can run * those programs for you. * * Whether the values have been computed before, or * whether you ask QualityTestTool to do it, they * remain where they are. We do not delete the results * of the test, in case the user wants to run computations * again without all the work of building the test * material. * * We compute two metrics that are interesting. The * first compares page-coverage among the engines. * The second compares ranking-quality. We emit * statistics for both. * * @author Mike Cafarella **************************************************/ public class QualityTestTool { final static String UNIQUE_URLS = "uniqueURLs.bin"; final static String QUERY_LIST = "queryList.txt"; final static String URL_INSET_SUFFIX = ".urlInset"; final static String QUERY_RESULTS_SUFFIX = ".queryResults"; final static String ENGINE_DESC_SUFFIX = ".src"; final static String NUTCH_LABEL = "Nutch"; public static final Logger LOG = LogFormatter.getLogger("net.nutch.quality.QualityTestTool"); File inputsDir; TreeMap engineInsetData = null; /** * The inputs are given to us. */ public QualityTestTool(File inputsDir) { this.inputsDir = inputsDir; } /** * We need to compute all the inputs ourselves. */ public QualityTestTool(File externalEngines, String userAgent, String segmentsDirs[], File queryList) throws IOException, ParseException { this.inputsDir = new File("localInputs"); if (inputsDir.exists()) { throw new IOException("Cannot run QualityTestTool. File " + inputsDir + " already exists"); } inputsDir.mkdirs(); createInputs(externalEngines, userAgent, queryList, segmentsDirs); } /** * Given a directory full of search engine descriptions, * a directory of Nutch segments, and a list of queries, * we need to build the necessary files for a Quality Test. * * Everything is placed in the directory at "inputsDir". */ private void createInputs(File externalEngines, String userAgent, File queryList, String segmentsDirs[]) throws IOException, ParseException { // // 1st, just copy the query list // LOG.info("CreateInputs, 1 of 6: Copying query list..."); File targetQueryList = new File(inputsDir, QUERY_LIST); FileUtil.copyContents(queryList, targetQueryList, true); // // 2nd, test the queryList against all the remote // search engines. Use ResultSetGenerator for this. // LOG.info("CreateInputs, 2 of 6: Testing queries against remote engines..."); File engineDescs[] = externalEngines.listFiles(); for (int i = 0; i < engineDescs.length; i++) { String filename = engineDescs[i].getName(); if (filename.endsWith(ENGINE_DESC_SUFFIX)) { // Compute the engine name int suffixStart = filename.lastIndexOf(ENGINE_DESC_SUFFIX); String engineName = filename.substring(0, suffixStart); PageExtractor.IExtractor extractor = new PageExtractor.RemotePageExtractor(engineDescs[i], userAgent, false); ResultSetGenerator rsg = new ResultSetGenerator(extractor, false); rsg.processQueries(targetQueryList, new File(inputsDir, engineName + QUERY_RESULTS_SUFFIX)); } } // // 3rd, test the queryList against the Nutch segments. // LOG.info("CreateInputs, 3 of 6: Testing queries against local Nutch segments..."); for (int i = 0; i < segmentsDirs.length; i++) { PageExtractor.IExtractor extractor = new PageExtractor.NutchExtractor(segmentsDirs[i]); ResultSetGenerator rsg = new ResultSetGenerator(extractor, false); rsg.processQueries(targetQueryList, new File(inputsDir, NUTCH_LABEL + "." + i + QUERY_RESULTS_SUFFIX)); } // // 4th, now that we have all the queryResults, we // compute the uniquified URL list. This is written // to a file in inputsDir. // // Note we may have a value "minSupport". If the percentage // of engines that include a given term in the top-10 list // is >= minSupport, then it is included in the uniquified // list. Otherwise, the term doesn't make it. This approach // lets us remove wholly "idiosyncratic" URLs from the test // pool. // // Of course, if minSupport is 0 then all items will pass // the guard. // LOG.info("CreateInputs, 4 of 6: Computing unique URL set..."); TreeMap returnedURLSets = new TreeMap(); SortedMap uniqueMap = new TreeMap(); File resultLists[] = inputsDir.listFiles(); for (int i = 0; i < resultLists.length; i++) { String filename = resultLists[i].getName(); if (filename.endsWith(QUERY_RESULTS_SUFFIX)) { // Compute the engine name int suffixStart = filename.lastIndexOf(QUERY_RESULTS_SUFFIX); String engineName = filename.substring(0, suffixStart); // Store all the URLs returned by this engine SortedSet returnedURLSet = new TreeSet(); returnedURLSets.put(engineName, returnedURLSet); // Load in the engine's result set DataInputStream in = new DataInputStream(new FileInputStream(resultLists[i])); try { int numQueries = in.readInt(); for (int j = 0; j < numQueries; j++) { String query = in.readUTF(); int numResults = in.readInt(); for (int k = 0; k < numResults; k++) { String foundURL = in.readUTF(); // Remember all the URLs from this engine returnedURLSet.add(foundURL); } } } finally { in.close(); } } } // // Figure out whether each term enjoys enough support // to make it into the unique set. // // Go through all known sets... for (Iterator it = returnedURLSets.values().iterator(); it.hasNext(); ) { SortedSet curSet = (SortedSet) it.next(); // And iterate through the terms of each set... for (Iterator it2 = curSet.iterator(); it2.hasNext(); ) { String term = (String) it2.next(); // Testing each term to make sure it is common enough... int containsCount = 0; for (Iterator it3 = returnedURLSets.values().iterator(); it3.hasNext(); ) { SortedSet testSet = (SortedSet) it3.next(); if (testSet.contains(term)) { containsCount++; } } // Before inserting the term into the uniquified pool. uniqueMap.put(term, new Integer(containsCount)); } } // Now write out the unique URL set File uniqueURLs = new File(inputsDir, UNIQUE_URLS); DataOutputStream out = new DataOutputStream(new FileOutputStream(uniqueURLs)); try { out.writeInt(uniqueMap.size()); for (Iterator it = uniqueMap.keySet().iterator(); it.hasNext(); ) { String url = (String) it.next(); Integer count = (Integer) uniqueMap.get(url); out.writeUTF(url); out.writeInt(count.intValue()); } } finally { out.close(); } // // 5th, we test each remote search engine to see // if it contains each unique URL. We write the // results of each test to inputsDir. Use URLInsetTester // for this. // LOG.info("CreateInputs, 5 of 6: Test membership of each URL in every remote engine..."); for (int i = 0; i < engineDescs.length; i++) { String filename = engineDescs[i].getName(); if (filename.endsWith(ENGINE_DESC_SUFFIX)) { // Compute the engine name int suffixStart = filename.lastIndexOf(ENGINE_DESC_SUFFIX); String engineName = filename.substring(0, suffixStart); // Test the URLs to see if they are in-set PageExtractor.RemotePageExtractor extractor = new PageExtractor.RemotePageExtractor(engineDescs[i], userAgent, false); URLInsetTester uit = new URLInsetTester(extractor, false); uit.testURLs(uniqueURLs, (TreeSet) returnedURLSets.get(engineName), new File(inputsDir, engineName + URL_INSET_SUFFIX)); } } // // 6th, we test Nutch to see if it contains each // unique URL. Write the results to inputsDir. // LOG.info("CreateInputs, 6 of 6: Test membership of each URL in local Nutch segments..."); for (int i = 0; i < segmentsDirs.length; i++) { PageExtractor.NutchExtractor extractor = new PageExtractor.NutchExtractor(segmentsDirs[i]); URLInsetTester uit = new URLInsetTester(extractor, false); uit.testURLs(uniqueURLs, (TreeSet) returnedURLSets.get(NUTCH_LABEL + "." + i), new File(inputsDir, NUTCH_LABEL + "." + i + URL_INSET_SUFFIX)); } } /** * This assumes we have a directory full of all the information * we need. We look in inputsDir for files of this format: * * "queryList.txt" * "uniqueURLs.txt" * "searchEngineNameA.queryResults" * "searchEngineNameA.urlInset" * "searchEngineNameB.queryResults" * "searchEngineNameB.urlInset" * ... * * We assume that queryList.txt has only a few hundred * items in it, tops. If that assumption doesn't hold, * then this code might not be efficent enough. */ public void runTests(boolean testCoverage, boolean testOrdering, double coverageConsensus) throws IOException { // // Part I. Compute the coverage numbers. // if (testCoverage) { computeCoverageScore(coverageConsensus); System.out.println(); System.out.println(); } // // Part I.5. Compute the 'top-10 eccentric' score // if (testCoverage && (coverageConsensus > 0.0)) { computeEccentricScore(coverageConsensus); System.out.println(); System.out.println(); } // // Part II. Compute the ordering scores. // // if (testOrdering) { computeOrderingScore(); } } /** * Compute page-coverage over all the engines. * Uses information stored in "inputsDir". */ private void computeCoverageScore(double coverageConsensus) throws IOException { // // 1. Figure out how many engines we're testing, and how // many times a term needs to appear to satisfy "coverageConsensus" // int numEngines = 0; File contents[] = inputsDir.listFiles(); for (int i = 0; i < contents.length; i++) { String filename = contents[i].getName(); if (filename.endsWith(URL_INSET_SUFFIX)) { // Compute the engine name numEngines++; } } int requiredCount = (int) Math.ceil(numEngines * coverageConsensus); System.out.println("URL must be present in at least " + requiredCount + " (" + coverageConsensus + ") item(s)"); // // 2. Load in the complete uniquified URL list, along // with counts of how many engines have the URL. // Don't include terms that fail to satisfy coverageConsensus // TreeMap uniqueURLs = new TreeMap(); DataInputStream in = new DataInputStream(new FileInputStream(new File(inputsDir, UNIQUE_URLS))); try { int numItems = in.readInt(); for (int i = 0; i < numItems; i++) { String url = in.readUTF(); int count = in.readInt(); if (count >= requiredCount) { uniqueURLs.put(url, new Integer(count)); } } } finally { in.close(); } // // 3. Go through each engine and load in the list // of inset-URLs. // TreeMap urlInsetScores = new TreeMap(); int maxInsetScore = uniqueURLs.size(); for (int i = 0; i < contents.length; i++) { String filename = contents[i].getName(); if (filename.endsWith(URL_INSET_SUFFIX)) { // Compute the engine name int suffixStart = filename.lastIndexOf(URL_INSET_SUFFIX); String engineName = filename.substring(0, suffixStart); // Load in the engine's url-inset list int insetScore = 0; DataInputStream din = new DataInputStream(new FileInputStream(contents[i])); try { int numItems = din.readInt(); // // Load in whether each URL was in-set or not. // If it was in-set for the engine, and is in // the qualified unique set overall, then the // engine gets a point. // for (int j = 0; j < numItems; j++) { String url = din.readUTF(); if (din.readBoolean() && uniqueURLs.get(url) != null) { insetScore++; } } } finally { din.close(); } // When done processing this file, store the score urlInsetScores.put(engineName, new Integer(insetScore)); } } // // Third, output the coverage statistics // System.out.println("Engine\t\tCoverage Score"); System.out.println("--------------------------------"); for (Iterator it = urlInsetScores.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); int score = ((Integer) urlInsetScores.get(engineName)).intValue(); System.out.println(engineName + "\t\t" + score + " of " + maxInsetScore + "\t(" + ((score / (1.0 * maxInsetScore)) * 100) + "%)"); } } /** * Figure out how many of an engine's URLs are * not "eccentric". That is, the URL WILL appear * in at least "coverageConsensus" percentage of * the engines' results. */ private void computeEccentricScore(double coverageConsensus) throws IOException { // // 1. Figure out how many engines we're testing, and how // many times a term needs to appear to satisfy "coverageConsensus" // int numEngines = 0; File contents[] = inputsDir.listFiles(); for (int i = 0; i < contents.length; i++) { String filename = contents[i].getName(); if (filename.endsWith(URL_INSET_SUFFIX)) { // Compute the engine name numEngines++; } } int requiredCount = (int) Math.ceil(numEngines * coverageConsensus); System.out.println("URL must be present in at least " + requiredCount + " (" + coverageConsensus + ") item(s)"); // // 2. Load in the complete uniquified URL list, along // with counts of how many engines have the URL. // Don't include terms that fail to satisfy coverageConsensus // TreeMap sharedURLs = new TreeMap(); DataInputStream in = new DataInputStream(new FileInputStream(new File(inputsDir, UNIQUE_URLS))); try { int numItems = in.readInt(); for (int i = 0; i < numItems; i++) { String url = in.readUTF(); int count = in.readInt(); if (count >= requiredCount) { sharedURLs.put(url, new Integer(count)); } } } finally { in.close(); } // // 3. Go through each engine and load in its // top-10 list. Check to see if each URL in // this set is also present in the "sharedURLs" // table. The ratio of in-top-10 vs in-shared-set // is the value we're after for each engine. // TreeMap engineURLs = new TreeMap(), engineSharedURLs = new TreeMap(); File resultFiles[] = inputsDir.listFiles(); for (int i = 0; i < resultFiles.length; i++) { String filename = resultFiles[i].getName(); if (filename.endsWith(QUERY_RESULTS_SUFFIX)) { // Compute engine name int suffixStart = filename.lastIndexOf(QUERY_RESULTS_SUFFIX); String engineName = filename.substring(0, suffixStart); // Load in results in = new DataInputStream(new FileInputStream(resultFiles[i])); int engineTopURLs = 0, inSharedSet = 0; try { int numQueries = in.readInt(); for (int j = 0; j < numQueries; j++) { String query = in.readUTF(); int numResults = in.readInt(); for (int k = 0; k < numResults; k++) { String result = in.readUTF(); engineTopURLs++; if (sharedURLs.containsKey(result)) { inSharedSet++; } } } engineURLs.put(engineName, new Integer(engineTopURLs)); engineSharedURLs.put(engineName, new Integer(inSharedSet)); } finally { in.close(); } } } // // 4. Output stats // System.out.println("Engine\t\tIn-shared-set score"); System.out.println("--------------------------------"); for (Iterator it = engineURLs.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); int urlScore = ((Integer) engineURLs.get(engineName)).intValue(); int sharedScore = ((Integer) engineSharedURLs.get(engineName)).intValue(); System.out.println(engineName + "\t\t" + sharedScore + " of " + urlScore + "\t(" + ((sharedScore / (1.0 * urlScore)) * 100) + "%)"); } } /** * Compute numbers that tell us how good the orderings are. * * Part of this test involves using the MarkovRankSolver to * compute a "best group-contribution ranking" that minimizes * the overall Kendall Tau distance between the complete * ranking and each contributing sublist. */ private void computeOrderingScore() throws IOException { // // For an engine to say anything about two items, // both must be in-set, and at least one must be in // the top-10 list. // // Before we do anything, load in the result lists // and the URL-inset data. // TreeMap engineResults = new TreeMap(); engineInsetData = new TreeMap(); File resultFiles[] = inputsDir.listFiles(); for (int i = 0; i < resultFiles.length; i++) { String filename = resultFiles[i].getName(); if (filename.endsWith(QUERY_RESULTS_SUFFIX)) { // Compute engine name int suffixStart = filename.lastIndexOf(QUERY_RESULTS_SUFFIX); String engineName = filename.substring(0, suffixStart); // Load in results DataInputStream in = new DataInputStream(new FileInputStream(resultFiles[i])); try { TreeMap resultLists = new TreeMap(); int numQueries = in.readInt(); for (int j = 0; j < numQueries; j++) { String query = in.readUTF(); int numResults = in.readInt(); String resultList[] = new String[numResults]; for (int k = 0; k < numResults; k++) { resultList[k] = in.readUTF(); } resultLists.put(query, resultList); } engineResults.put(engineName, resultLists); } finally { in.close(); } // Next, load in the inset-data in = new DataInputStream(new FileInputStream(new File(inputsDir, engineName + URL_INSET_SUFFIX))); try { TreeSet insetURLs = new TreeSet(); int numItems = in.readInt(); for (int j = 0; j < numItems; j++) { String url = in.readUTF(); if (in.readBoolean()) { insetURLs.add(url); } } engineInsetData.put(engineName, insetURLs); } finally { in.close(); } } } // // We now have two large useful structures. // // A. engineResults is a Map that maps engine names // to another Map. The value Map maps Queries to // String Arrays of results. // // B. engineInsetData is a Map that maps engine names // to a Set. This Set contains all the relevant URLs // that the engine has indexed. // // // Figure out all the pairwise statements that // come from an engine's top-10 list (not including // the ones from position 11 and lower). // // // Each engine should have identical keys listed in // its Map from query terms to Arrays of results. So // just pick the first one from 'engineResults'. // Map defaultQueryMap = (Map) engineResults.get((String) engineResults.firstKey()); Map overallDistances = new TreeMap(), bestPageScores = new TreeMap(); for (Iterator it = engineResults.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); overallDistances.put(engineName, new Double(0.0)); bestPageScores.put(engineName, new Double(0.0)); } // Iterate through every query. for (Iterator it = defaultQueryMap.keySet().iterator(); it.hasNext(); ) { String query = (String) it.next(); // // Go through every engine, finding the results for the query. // Build a good full-ordering using the Markov solver // MarkovRankSolver fullOrdering = new MarkovRankSolver(); for (Iterator it2 = engineResults.keySet().iterator(); it2.hasNext(); ) { String engineName = (String) it2.next(); Map queryMap = (Map) engineResults.get(engineName); // Get results from this engine for the current query String results[] = (String[]) queryMap.get(query); fullOrdering.addOrdering(results); } fullOrdering.solveRanking(); int numMarkovStates = fullOrdering.getNumStates(); // // For each engine, compute the DT distance to the full-ordering // for (Iterator it2 = engineResults.keySet().iterator(); it2.hasNext(); ) { String engineName = (String) it2.next(); Map queryMap = (Map) engineResults.get(engineName); String results[] = (String[]) queryMap.get(query); // Find how many binary misorderings there are between // the results list and the full markov list double curDistance = 0.0; if (results.length > 1) { curDistance = fullOrdering.getKendallTauDistance(results, true); } double oldScore = ((Double) overallDistances.get(engineName)).doubleValue(); overallDistances.put(engineName, new Double(oldScore + curDistance)); } // // For each engine, compute the "Best Pages score", which // measures how close each engine's top-10 list matches the // Markov model's top-10 list. // for (Iterator it2 = engineResults.keySet().iterator(); it2.hasNext(); ) { String engineName = (String) it2.next(); Map queryMap = (Map) engineResults.get(engineName); String results[] = (String[]) queryMap.get(query); // // Assign a score for each item in our results // list. double newScore = 0.0; for (int i = 0; i < results.length; i++) { int markovPos = fullOrdering.getPos(results[i]); newScore += (numMarkovStates - markovPos); } double oldScore = ((Double) bestPageScores.get(engineName)).doubleValue(); bestPageScores.put(engineName, new Double(oldScore + newScore)); } } // Emit score to stdout System.out.println("Engine\t\tNormalized Kendall Tau Distance"); System.out.println("--------------------------------"); for (Iterator it = overallDistances.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); Double fullDistance = (Double) overallDistances.get(engineName); System.out.println(engineName + "\t\t" + fullDistance); } System.out.println(); System.out.println(); System.out.println("Engine\t\t'Best-Page Score'"); System.out.println("--------------------------------"); for (Iterator it = bestPageScores.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); Double score = (Double) bestPageScores.get(engineName); System.out.println(engineName + "\t\t" + score); } } /** * Take all the file-args we need to compute test results. */ public static void main(String argv[]) throws IOException, ParseException { if (argv.length < 2) { System.out.println("Usage: java net.nutch.quality.QualityTestTool (-initTest <externalEngineDirectory> <userAgent> <queryList> [-nutchSegment <segmentsDirectory0>] [-nutchSegment <segmentsDirectory1>] ... [-nutchSegmentSet <segmentDir>]) | (-repeatTest <existingWorkDir>) [-coverageConsensus <double>] [-noCoverageTest] [-noOrderingTest]"); System.out.println(); System.out.println("Note that 'coverageConsensus' should be a value between 0.0 and 1.0"); return; } // vars for parsing command-line options File extEngineDescs = null, queryList = null; File existingWorkDir = null; String userAgent = null; Vector nutchSegments = new Vector(); boolean initTest = false, repeatTest = false; boolean testCoverage = true, testOrdering = true; double coverageConsensus = 0.0; // loop through cmd args for (int i = 0; i < argv.length; i++) { if ("-initTest".equals(argv[i])) { extEngineDescs = new File(argv[i + 1]); userAgent = argv[i + 2]; queryList = new File(argv[i + 3]); i += 3; initTest = true; } else if ("-repeatTest".equals(argv[i])) { existingWorkDir = new File(argv[i+1]); repeatTest = true; i++; } else if ("-nutchSegment".equals(argv[i])) { nutchSegments.add(new File(argv[i + 1]).getPath()); i++; } else if ("-nutchSegmentSet".equals(argv[i])) { File segmentSet = new File(argv[i+1]); File segmentSubdirs[] = segmentSet.listFiles(); for (int j = 0; i < segmentSubdirs.length; j++) { if (segmentSubdirs[j].isDirectory()) { nutchSegments.add(segmentSubdirs[i].getPath()); } } i++; } else if ("-coverageConsensus".equals(argv[i])) { coverageConsensus = Double.parseDouble(argv[i+1]); i++; } else if ("-noCoverageTest".equals(argv[i])) { testCoverage = false; } else if ("-noOrderingTest".equals(argv[i])) { testOrdering = false; } else { System.out.println("Unknown arg: " + argv[i]); return; } } // Know what kind of test to run if ((initTest && repeatTest) || (! initTest && ! repeatTest)) { System.out.println("Must either 'initTest' or 'repeatTest'"); return; } // Make sure something's being run if (! testCoverage && ! testOrdering) { System.out.println("Must run at least one test."); return; } // Build the QTT QualityTestTool qtt = null; if (initTest) { int i = 0; String segments[] = new String[nutchSegments.size()]; for (Enumeration e = nutchSegments.elements(); e.hasMoreElements(); i++) { segments[i] = (String) e.nextElement(); } qtt = new QualityTestTool(extEngineDescs, userAgent, segments, queryList); } else { qtt = new QualityTestTool(existingWorkDir); } // Kick it off. qtt.runTests(testCoverage, testOrdering, coverageConsensus); } }